Analysis of > 13000 free text responses to the question in Survey 5

if_you_could_improve_one_thing_about_the_website_what_would_it_be?

Method

  • responses extracted from spreadsheet
  • convert responses to document feature matrix (weighted by tf-idf - term frequency inverse document frequency)
  • reduce dimensionality and cluster
  • label clusters
  • hold out unclustered data
  • group categories into broad themes
  • build model to predict theme labels in clustered data
  • apply model to unclustered data
  • recombine
  • visualise results
library(tidyverse)
library(janitor)
library(myScrapers)
library(quanteda)
library(mclust)
library(parallel)
library(Rtsne)
library(dbscan)
library(quanteda.textmodels)

quanteda::quanteda_options(threads = 8)
c_survey <- read_csv("~/Documents/Covid-19 survey v0.05 (Responses) - Form Responses 1.csv") %>%
  clean_names()

free_text_1 <- c_survey %>%
  select(starts_with("if_you_could")) %>%
  filter(!is.na(if_you_could_improve_one_thing_about_the_website_what_would_it_be)) %>%
  mutate(pmid  = row_number(), 
         absText = if_you_could_improve_one_thing_about_the_website_what_would_it_be,
         title = pmid
  )

dim(free_text_1)

c_2 <- create_abstract_corpus(free_text_1)

c_2

Results

Create corpus

#create_abstract_cluster

 
## remove numbers 
corpus <- c_2$corpus %>% filter(!is.na(pmid), str_detect(word, "[[:alpha:]]" ))

## extract ids
pmid_1 <- pull(corpus, "pmid") %>% unique() %>% enframe()

free_text_1 <- free_text_1 %>%
  filter(pmid %in% pmid_1$value)


## reduce dimension
#tsne <- corpus %>% cast_sparse(pmid, word, tf_idf) %>% as.matrix() %>% 
  #Rtsne(check_duplicates = FALSE, perplexity = 50)

#tsne %>% saveRDS("tsne.rds")

Cluster and label

## load 
tsn <- read_rds("tsne.rds")

## dbscan method

set.seed(123)

## create cluster
clust <- hdbscan(tsn$Y, minPts = 60)

pairs(tsn$Y)

## combine datasets
clustering <- data.frame(cbind(pmid = pmid_1, tsn$Y, cluster = clust$cluster)) %>% 
  mutate(V2 = as.numeric(as.character(X1)), V3 = as.numeric(as.character(X2)))

clustering <- clustering %>% mutate(clustered = ifelse(cluster == 0, "not-clustered", "clustered"))

## create labels
labels <- create_cluster_labels(corpus, clustering, top_n = 6)

Classify

## classify statements
classification <- labels$results %>%
  left_join(free_text_1, by = c("pmid.value" = "pmid"))


# classification %>%
#   count(clus_names, sort = TRUE) 

## recode categories
categorised <- classification %>%
  mutate(broad_cat = case_when(str_detect(clus_names, "mobile|iPad") ~ "mobile", 
                               str_detect(clus_names, "vaccin") ~ "vaccination-data", 
                               str_detect(clus_names, "add") ~ "add-misc.", 
                               str_detect(clus_names, "zoom") ~ "zooming issues", 
                               #str_detect(clus_names, "ag") ~ "age-breakdowns", 
                               str_detect(clus_names, "region") ~ "regional data", 
                               str_detect(clus_names, "colour") ~ "colour-issues", 
                               str_detect(clus_names, "compar|previou") ~ "comparisons", 
                               str_detect(clus_names, "crash|fail|interact") ~ "map-stability", 
                               str_detect(clus_names, "detail") ~ "detail", 
                               str_detect(clus_names, "breakdown") ~ "breakdowns",
                               str_detect(clus_names, "excel") ~ "praise",
                               str_detect(clus_names, "death") ~ "deaths", 
                               str_detect(clus_names, "time|consist") ~ "get-figures-out-on-time", 
                               str_detect(clus_names, "local|msoa") ~ "granularity", 
                               str_detect(clus_names, "home") ~ "homepage", 
                               str_detect(clus_names, "date|frequent|lag") ~ "up-to-date|updates",
                               str_detect(clus_names, "link") ~ "awareness", 
                               str_detect(clus_names, "easi|navig") ~ "navigation", 
                               str_detect(clus_names, "graph") ~ "graph-issues", 
                               str_detect(clus_names, "pcr") ~ "testing-date",
                               str_detect(clus_names, "post-code|postcod") ~ "post-code-issues", 
                               str_detect(clus_names, "recov") ~ "recovery",
                               str_detect(clus_names, "rate") ~ "rates-R"))

Plot

library(treemap)
library(d3treeR)

### create scatter plot of responses


## calculcate median values of each cluster

cross_hairs <- classification %>%
  group_by(clus_names) %>%
  summarise(medX = median(V2), 
            medY = median(V3))


## plot scatter plot with colours for each cluster

classification %>%
  as_tibble() %>%
  ggplot(aes(V2, V3)) + 
  geom_jitter(aes(colour = clus_names), show.legend = FALSE, , alpha = 0.4) +
  geom_point(aes(medX, medY), data = cross_hairs , shape = "X", size = 5) +
  ggrepel::geom_text_repel(aes(label = clus_names, medX, medY), data = cross_hairs, size = 4) +
  theme(axis.text = element_blank(), 
        panel.background = element_blank()) +
  labs(title = "Estimated clustering of 13085 free text responses to\n'What one thing would you improve about the website'",
       x = "", 
       y = ""
       )

### table of responses in random samples from each cluster
classification %>%
  group_by(clus_names) %>%
  mutate(n = n()) %>%
  sample_n(5) %>%
  select(clus_names, absText, n) %>%
  DT::datatable()

test train split

library(rsample)


## holdout unclustered data
heldout <- categorised %>%
  filter(cluster == 0)

## training data
training <- categorised %>%
  filter(cluster != 0)

## test train split
split <- initial_split(training, strata = broad_cat)
train <- training(split)
test <- testing(split)

## create a document term matrix, remove common words, numbers, punctuation, apply stemming

train_dfm <- corpus(train, text_field = "absText") %>%
  dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

#docvars(train_dfm)

test_dfm <- corpus(test, text_field = "absText") %>%
  dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)

## build  model
mod1 <- textmodel_svm(train_dfm, y = docvars(train_dfm, "broad_cat"))

Model

## caret is modelling package - we are using to measure accuracy
library(caret)


## match features in test set to training set
dfm_match <- dfm_match(test_dfm, features = featnames(train_dfm)) 

## extract labels we are trying to predict
actual <- dfm_match$broad_cat

## predict labels
predicted <- predict(mod1, newdata = dfm_match)


## compare accuracy of predictions with labels
caret::confusionMatrix(table(actual, predicted)) %>%
  tidy() %>%
  filter(str_detect(term, "accuracy"))
## # A tibble: 17 x 6
##    term              class                   estimate conf.low conf.high p.value
##    <chr>             <chr>                      <dbl>    <dbl>     <dbl>   <dbl>
##  1 accuracy          <NA>                       0.934    0.922     0.944       0
##  2 balanced_accuracy awareness                  0.928   NA        NA          NA
##  3 balanced_accuracy colour-issues              1       NA        NA          NA
##  4 balanced_accuracy comparisons                0.948   NA        NA          NA
##  5 balanced_accuracy deaths                     0.960   NA        NA          NA
##  6 balanced_accuracy get-figures-out-on-time    0.966   NA        NA          NA
##  7 balanced_accuracy granularity                0.948   NA        NA          NA
##  8 balanced_accuracy graph-issues               0.899   NA        NA          NA
##  9 balanced_accuracy map-stability              0.962   NA        NA          NA
## 10 balanced_accuracy navigation                 0.938   NA        NA          NA
## 11 balanced_accuracy post-code-issues           0.986   NA        NA          NA
## 12 balanced_accuracy recovery                   0.971   NA        NA          NA
## 13 balanced_accuracy regional data              0.958   NA        NA          NA
## 14 balanced_accuracy testing-date               0.983   NA        NA          NA
## 15 balanced_accuracy up-to-date|updates         0.970   NA        NA          NA
## 16 balanced_accuracy vaccination-data           0.979   NA        NA          NA
## 17 balanced_accuracy zooming issues             0.986   NA        NA          NA

Apply to unclustered responses

held_dfm <- corpus(heldout, text_field = "absText") %>%
  dfm(., remove = stopwords("en"), remove_numbers = TRUE, remove_punct = TRUE, stem = TRUE)


total_match <- dfm_match(held_dfm, features = featnames(train_dfm))

## predict labels
predict_total <- predict(mod1, newdata = total_match)

## add predictions

predicted_data <- data.frame(heldout, predicted = predict_total)

glimpse(predicted_data)
## Rows: 4,850
## Columns: 14
## $ pmid.name                                                         <int> 1, …
## $ pmid.value                                                        <int> 1, …
## $ X1                                                                <dbl> -10…
## $ X2                                                                <dbl> 11.…
## $ cluster                                                           <dbl> 0, …
## $ V2                                                                <dbl> -10…
## $ V3                                                                <dbl> 11.…
## $ clustered                                                         <chr> "no…
## $ clus_names                                                        <chr> "va…
## $ if_you_could_improve_one_thing_about_the_website_what_would_it_be <chr> "ra…
## $ absText                                                           <chr> "ra…
## $ title                                                             <int> 1, …
## $ broad_cat                                                         <chr> "va…
## $ predicted                                                         <fct> vac…
glimpse(training)
## Rows: 8,235
## Columns: 13
## $ pmid.name                                                         <int> 3, …
## $ pmid.value                                                        <int> 4, …
## $ X1                                                                <dbl> 43.…
## $ X2                                                                <dbl> -20…
## $ cluster                                                           <dbl> 5, …
## $ V2                                                                <dbl> 43.…
## $ V3                                                                <dbl> -20…
## $ clustered                                                         <chr> "cl…
## $ clus_names                                                        <chr> "lo…
## $ if_you_could_improve_one_thing_about_the_website_what_would_it_be <chr> "Th…
## $ absText                                                           <chr> "Th…
## $ title                                                             <int> 4, …
## $ broad_cat                                                         <chr> "ma…
pred <- predicted_data %>%
  select(-broad_cat, broad_cat = predicted) %>%
  bind_rows(training) 

counts <- pred %>%
  count(broad_cat, clus_names) 

counts
##                  broad_cat
## 1                awareness
## 2                awareness
## 3            colour-issues
## 4            colour-issues
## 5              comparisons
## 6              comparisons
## 7              comparisons
## 8                   deaths
## 9                   deaths
## 10 get-figures-out-on-time
## 11 get-figures-out-on-time
## 12 get-figures-out-on-time
## 13 get-figures-out-on-time
## 14             granularity
## 15             granularity
## 16             granularity
## 17             granularity
## 18             granularity
## 19             granularity
## 20             granularity
## 21             granularity
## 22            graph-issues
## 23            graph-issues
## 24           map-stability
## 25           map-stability
## 26           map-stability
## 27           map-stability
## 28           map-stability
## 29           map-stability
## 30           map-stability
## 31           map-stability
## 32           map-stability
## 33           map-stability
## 34           map-stability
## 35           map-stability
## 36           map-stability
## 37              navigation
## 38              navigation
## 39              navigation
## 40        post-code-issues
## 41        post-code-issues
## 42                recovery
## 43                recovery
## 44           regional data
## 45           regional data
## 46            testing-date
## 47            testing-date
## 48      up-to-date|updates
## 49      up-to-date|updates
## 50      up-to-date|updates
## 51        vaccination-data
## 52        vaccination-data
## 53        vaccination-data
## 54        vaccination-data
## 55        vaccination-data
## 56        vaccination-data
## 57        vaccination-data
## 58        vaccination-data
## 59        vaccination-data
## 60        vaccination-data
## 61        vaccination-data
## 62        vaccination-data
## 63          zooming issues
## 64          zooming issues
## 65          zooming issues
## 66          zooming issues
## 67          zooming issues
##                                                         clus_names   n
## 1                                 link-site-websit-inform-map-data 563
## 2                                  vaccin-local-graph-dai-map-data 630
## 3                              colour-code-chang-rate-interact-map 209
## 4                                  vaccin-local-graph-dai-map-data  20
## 5                       comparison-previou-week-compar-averag-data  70
## 6                               previou-week-averag-dai-graph-data 245
## 7                                  vaccin-local-graph-dai-map-data 148
## 8                               excess-covid-death-includ-dai-data 142
## 9                                  vaccin-local-graph-dai-map-data 164
## 10                                 figur-time-daili-graph-dai-data 126
## 11                           real-lag-consist-time-report-map-data 133
## 12                             regular-consist-updat-time-dai-data 161
## 13                                 vaccin-local-graph-dai-map-data 222
## 14              histor-select-level-author-download-local-map-data 184
## 15                             inform-week-date-local-dai-map-data  80
## 16                         localis-easi-provid-clearer-inform-data 103
## 17                                 post-code-search-local-data-map 119
## 18                                 roll-rate-infect-local-map-data 143
## 19                     sourc-msoa-histor-download-displai-set-data 316
## 20                                 vaccin-local-graph-dai-map-data 852
## 21                               week-date-report-local-graph-data 321
## 22 logarithm-size-larger-log-confus-axi-scale-clearer-vertic-graph 100
## 23                                 vaccin-local-graph-dai-map-data   8
## 24                  constantli-lot-frequent-reload-phone-crash-map 209
## 25                            devic-mobil-phone-interact-graph-map 186
## 26                          doesn’t-ipad-reload-crash-interact-map 146
## 27                               excel-it’-updat-interact-map-data 116
## 28                          fail-frequent-reload-interact-map-data 113
## 29                                                    interact-map  65
## 30                      iphon-doesn’t-reload-io-crash-interact-map 109
## 31                        lot-constantli-reload-crash-interact-map  84
## 32                      reliabl-updat-crash-time-interact-map-data  67
## 33                             speed-faster-fail-load-interact-map 142
## 34                           stabil-improv-crash-interact-map-data 125
## 35                                stabl-reload-uk-bit-map-interact 151
## 36                                 vaccin-local-graph-dai-map-data 654
## 37                          navig-understand-easier-graph-map-data 185
## 38                            touch-axi-easi-read-scale-graph-data 196
## 39                                 vaccin-local-graph-dai-map-data 186
## 40                             rememb-town-search-postcod-map-data 256
## 41                                 vaccin-local-graph-dai-map-data 163
## 42                          recov-discharg-hospit-peopl-covid-data 377
## 43                                 vaccin-local-graph-dai-map-data  92
## 44                   breakdown-region-nation-compar-graph-data-map 131
## 45                                 vaccin-local-graph-dai-map-data  97
## 46                             pcr-percentag-test-posit-peopl-data 239
## 47                                 vaccin-local-graph-dai-map-data 156
## 48                              lag-averag-dai-date-map-graph-data  82
## 49                     notif-quicker-frequent-healthcar-updat-data 181
## 50                                 vaccin-local-graph-dai-map-data 222
## 51                      author-level-postcod-local-vaccin-data-map  76
## 52                          axi-add-inform-vaccin-graph-local-data 138
## 53                brand-administ-project-prioriti-dose-type-vaccin  61
## 54                             breakdown-band-ag-death-vaccin-data 363
## 55                       detail-inform-local-graph-map-vaccin-data 186
## 56                             home-page-summari-vaccin-graph-data 167
## 57                    homepag-total-weekli-daili-vaccin-graph-data 153
## 58              manufactur-homepag-api-link-includ-vaccin-map-data  88
## 59                       pfizer-dose-prioriti-progress-vaccin-data 106
## 60                popul-percentag-prioriti-total-peopl-vaccin-data 150
## 61                                 vaccin-local-graph-dai-map-data 892
## 62                            variant-info-date-local-vaccin-updat 145
## 63                               axi-function-abil-zoom-graph-date 108
## 64                                   bar-chart-zoom-graph-dai-data  71
## 65                    constantli-stop-crash-zoom-interact-time-map 152
## 66                         hard-devic-touch-mobil-phone-zoom-graph  96
## 67                                 vaccin-local-graph-dai-map-data 344

Treemap

palette = viridis::viridis(20)

t <- treemap(counts, 
               index = c("broad_cat", "clus_names"), 
               vSize = "n", 
               type = "index", 
               palette = palette)

d3tree3(t, rootname = "Grouped answers")

Sample

predicted_data %>%
  group_by(predicted) %>%
  sample_n(3) %>%
  select( absText, clus_names, predicted, broad_cat) %>%
  DT::datatable()